{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<h1> 2. Creating a sampled dataset </h1>\n",
    "\n",
    "This notebook illustrates:\n",
    "<ol>\n",
    "<li> Sampling a BigQuery dataset to create datasets for ML\n",
    "<li> Preprocessing with Pandas\n",
    "</ol>"
   ]
  },
  {
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
 "!sudo chown -R jupyter:jupyter /home/jupyter/training-data-analyst"
   ]
  },
  {

"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
 "# Ensure the right version of Tensorflow is installed.\n",
 "!pip freeze | grep tensorflow==2.1"
 ]
},

  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "# change these to try this notebook out\n",
    "BUCKET = 'cloud-training-demos-ml'\n",
    "PROJECT = 'cloud-training-demos'\n",
    "REGION = 'us-central1'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "os.environ['BUCKET'] = BUCKET\n",
    "os.environ['PROJECT'] = PROJECT\n",
    "os.environ['REGION'] = REGION"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%bash\n",
    "if ! gcloud storage ls | grep -q gs://${BUCKET}/; then\n",    "  gcloud storage buckets create --location=${REGION} gs://${BUCKET}\n",    "fi"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<h2> Create ML dataset by sampling using BigQuery </h2>\n",
    "<p>\n",
    "Let's sample the BigQuery data to create smaller datasets.\n",
    "</p>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create SQL query using natality data after the year 2000\n",
    "from google.cloud import bigquery\n",
    "query = \"\"\"\n",
    "SELECT\n",
    "  weight_pounds,\n",
    "  is_male,\n",
    "  mother_age,\n",
    "  plurality,\n",
    "  gestation_weeks,\n",
    "  FARM_FINGERPRINT(CONCAT(CAST(YEAR AS STRING), CAST(month AS STRING))) AS hashmonth\n",
    "FROM\n",
    "  publicdata.samples.natality\n",
    "WHERE year > 2000\n",
    "\"\"\""
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "There are only a limited number of years and months in the dataset. Let's see what the hashmonths are."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "There are 96 unique hashmonths.\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>hashmonth</th>\n",
       "      <th>num_babies</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>6392072535155213407</td>\n",
       "      <td>323758</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>8387817883864991792</td>\n",
       "      <td>331629</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>8391424625589759186</td>\n",
       "      <td>364497</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>9183605629983195042</td>\n",
       "      <td>329975</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>328012383083104805</td>\n",
       "      <td>359891</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "             hashmonth  num_babies\n",
       "0  6392072535155213407      323758\n",
       "1  8387817883864991792      331629\n",
       "2  8391424625589759186      364497\n",
       "3  9183605629983195042      329975\n",
       "4   328012383083104805      359891"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Call BigQuery but GROUP BY the hashmonth and see number of records for each group to enable us to get the correct train and evaluation percentages\n",
    "df = bigquery.Client().query(\"SELECT hashmonth, COUNT(weight_pounds) AS num_babies FROM (\" + query + \") GROUP BY hashmonth\").to_dataframe()\n",
    "print(\"There are {} unique hashmonths.\".format(len(df)))\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Here's a way to get a well distributed portion of the data in such a way that the test and train sets do not overlap:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "There are 13405 examples in the train dataset and 3262 in the eval dataset\n"
     ]
    }
   ],
   "source": [
    "# Added the RAND() so that we can now subsample from each of the hashmonths to get approximately the record counts we want\n",
    "trainQuery = \"SELECT * FROM (\" + query + \") WHERE ABS(MOD(hashmonth, 4)) < 3 AND RAND() < 0.0005\"\n",
    "evalQuery = \"SELECT * FROM (\" + query + \") WHERE ABS(MOD(hashmonth, 4)) = 3 AND RAND() < 0.0005\"\n",
    "traindf = bigquery.Client().query(trainQuery).to_dataframe()\n",
    "evaldf = bigquery.Client().query(evalQuery).to_dataframe()\n",
    "print(\"There are {} examples in the train dataset and {} in the eval dataset\".format(len(traindf), len(evaldf)))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<h2> Preprocess data using Pandas </h2>\n",
    "<p>\n",
    "Let's add extra rows to simulate the lack of ultrasound. In the process, we'll also change the plurality column to be a string."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>weight_pounds</th>\n",
       "      <th>is_male</th>\n",
       "      <th>mother_age</th>\n",
       "      <th>plurality</th>\n",
       "      <th>gestation_weeks</th>\n",
       "      <th>hashmonth</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>7.874912</td>\n",
       "      <td>True</td>\n",
       "      <td>27</td>\n",
       "      <td>1</td>\n",
       "      <td>40.0</td>\n",
       "      <td>774501970389208065</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>9.312326</td>\n",
       "      <td>True</td>\n",
       "      <td>33</td>\n",
       "      <td>1</td>\n",
       "      <td>38.0</td>\n",
       "      <td>774501970389208065</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>9.376260</td>\n",
       "      <td>True</td>\n",
       "      <td>31</td>\n",
       "      <td>1</td>\n",
       "      <td>40.0</td>\n",
       "      <td>774501970389208065</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>7.374463</td>\n",
       "      <td>True</td>\n",
       "      <td>31</td>\n",
       "      <td>1</td>\n",
       "      <td>38.0</td>\n",
       "      <td>774501970389208065</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>8.509843</td>\n",
       "      <td>False</td>\n",
       "      <td>34</td>\n",
       "      <td>1</td>\n",
       "      <td>38.0</td>\n",
       "      <td>774501970389208065</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   weight_pounds  is_male  mother_age  plurality  gestation_weeks  \\\n",
       "0       7.874912     True          27          1             40.0   \n",
       "1       9.312326     True          33          1             38.0   \n",
       "2       9.376260     True          31          1             40.0   \n",
       "3       7.374463     True          31          1             38.0   \n",
       "4       8.509843    False          34          1             38.0   \n",
       "\n",
       "            hashmonth  \n",
       "0  774501970389208065  \n",
       "1  774501970389208065  \n",
       "2  774501970389208065  \n",
       "3  774501970389208065  \n",
       "4  774501970389208065  "
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "traindf.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Also notice that there are some very important numeric fields that are missing in some rows (the count in Pandas doesn't count missing data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>weight_pounds</th>\n",
       "      <th>mother_age</th>\n",
       "      <th>plurality</th>\n",
       "      <th>gestation_weeks</th>\n",
       "      <th>hashmonth</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>count</th>\n",
       "      <td>13391.000000</td>\n",
       "      <td>13405.000000</td>\n",
       "      <td>13405.000000</td>\n",
       "      <td>13314.000000</td>\n",
       "      <td>1.340500e+04</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>mean</th>\n",
       "      <td>7.238220</td>\n",
       "      <td>27.337635</td>\n",
       "      <td>1.036255</td>\n",
       "      <td>38.614466</td>\n",
       "      <td>4.403132e+18</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>std</th>\n",
       "      <td>1.328578</td>\n",
       "      <td>6.170848</td>\n",
       "      <td>0.196276</td>\n",
       "      <td>2.576437</td>\n",
       "      <td>2.786276e+18</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>min</th>\n",
       "      <td>0.500449</td>\n",
       "      <td>12.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>17.000000</td>\n",
       "      <td>1.244589e+17</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25%</th>\n",
       "      <td>6.563162</td>\n",
       "      <td>22.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>38.000000</td>\n",
       "      <td>1.622638e+18</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>50%</th>\n",
       "      <td>7.312733</td>\n",
       "      <td>27.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>39.000000</td>\n",
       "      <td>4.329667e+18</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>75%</th>\n",
       "      <td>8.062305</td>\n",
       "      <td>32.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>40.000000</td>\n",
       "      <td>7.108882e+18</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>max</th>\n",
       "      <td>13.459221</td>\n",
       "      <td>50.000000</td>\n",
       "      <td>4.000000</td>\n",
       "      <td>47.000000</td>\n",
       "      <td>9.183606e+18</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       weight_pounds    mother_age     plurality  gestation_weeks  \\\n",
       "count   13391.000000  13405.000000  13405.000000     13314.000000   \n",
       "mean        7.238220     27.337635      1.036255        38.614466   \n",
       "std         1.328578      6.170848      0.196276         2.576437   \n",
       "min         0.500449     12.000000      1.000000        17.000000   \n",
       "25%         6.563162     22.000000      1.000000        38.000000   \n",
       "50%         7.312733     27.000000      1.000000        39.000000   \n",
       "75%         8.062305     32.000000      1.000000        40.000000   \n",
       "max        13.459221     50.000000      4.000000        47.000000   \n",
       "\n",
       "          hashmonth  \n",
       "count  1.340500e+04  \n",
       "mean   4.403132e+18  \n",
       "std    2.786276e+18  \n",
       "min    1.244589e+17  \n",
       "25%    1.622638e+18  \n",
       "50%    4.329667e+18  \n",
       "75%    7.108882e+18  \n",
       "max    9.183606e+18  "
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Let's look at a small sample of the training data\n",
    "traindf.describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "# It is always crucial to clean raw data before using in ML, so we have a preprocessing step\n",
    "import pandas as pd\n",
    "def preprocess(df):\n",
    "  # clean up data we don't want to train on\n",
    "  # in other words, users will have to tell us the mother's age\n",
    "  # otherwise, our ML service won't work.\n",
    "  # these were chosen because they are such good predictors\n",
    "  # and because these are easy enough to collect\n",
    "  df = df[df.weight_pounds > 0]\n",
    "  df = df[df.mother_age > 0]\n",
    "  df = df[df.gestation_weeks > 0]\n",
    "  df = df[df.plurality > 0]\n",
    "  \n",
    "  # modify plurality field to be a string\n",
    "  twins_etc = dict(zip([1,2,3,4,5],\n",
    "                   ['Single(1)', 'Twins(2)', 'Triplets(3)', 'Quadruplets(4)', 'Quintuplets(5)']))\n",
    "  df['plurality'].replace(twins_etc, inplace=True)\n",
    "  \n",
    "  # now create extra rows to simulate lack of ultrasound\n",
    "  nous = df.copy(deep=True)\n",
    "  nous.loc[nous['plurality'] != 'Single(1)', 'plurality'] = 'Multiple(2+)'\n",
    "  nous['is_male'] = 'Unknown'\n",
    "  \n",
    "  return pd.concat([df, nous])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>weight_pounds</th>\n",
       "      <th>is_male</th>\n",
       "      <th>mother_age</th>\n",
       "      <th>plurality</th>\n",
       "      <th>gestation_weeks</th>\n",
       "      <th>hashmonth</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>7.874912</td>\n",
       "      <td>True</td>\n",
       "      <td>27</td>\n",
       "      <td>Single(1)</td>\n",
       "      <td>40.0</td>\n",
       "      <td>774501970389208065</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>9.312326</td>\n",
       "      <td>True</td>\n",
       "      <td>33</td>\n",
       "      <td>Single(1)</td>\n",
       "      <td>38.0</td>\n",
       "      <td>774501970389208065</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>9.376260</td>\n",
       "      <td>True</td>\n",
       "      <td>31</td>\n",
       "      <td>Single(1)</td>\n",
       "      <td>40.0</td>\n",
       "      <td>774501970389208065</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>7.374463</td>\n",
       "      <td>True</td>\n",
       "      <td>31</td>\n",
       "      <td>Single(1)</td>\n",
       "      <td>38.0</td>\n",
       "      <td>774501970389208065</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>8.509843</td>\n",
       "      <td>False</td>\n",
       "      <td>34</td>\n",
       "      <td>Single(1)</td>\n",
       "      <td>38.0</td>\n",
       "      <td>774501970389208065</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   weight_pounds is_male  mother_age  plurality  gestation_weeks  \\\n",
       "0       7.874912    True          27  Single(1)             40.0   \n",
       "1       9.312326    True          33  Single(1)             38.0   \n",
       "2       9.376260    True          31  Single(1)             40.0   \n",
       "3       7.374463    True          31  Single(1)             38.0   \n",
       "4       8.509843   False          34  Single(1)             38.0   \n",
       "\n",
       "            hashmonth  \n",
       "0  774501970389208065  \n",
       "1  774501970389208065  \n",
       "2  774501970389208065  \n",
       "3  774501970389208065  \n",
       "4  774501970389208065  "
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "traindf.head()# Let's see a small sample of the training data now after our preprocessing\n",
    "traindf = preprocess(traindf)\n",
    "evaldf = preprocess(evaldf)\n",
    "traindf.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>weight_pounds</th>\n",
       "      <th>is_male</th>\n",
       "      <th>mother_age</th>\n",
       "      <th>plurality</th>\n",
       "      <th>gestation_weeks</th>\n",
       "      <th>hashmonth</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>13400</th>\n",
       "      <td>6.624891</td>\n",
       "      <td>Unknown</td>\n",
       "      <td>17</td>\n",
       "      <td>Single(1)</td>\n",
       "      <td>39.0</td>\n",
       "      <td>6637442812569910270</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13401</th>\n",
       "      <td>5.313141</td>\n",
       "      <td>Unknown</td>\n",
       "      <td>30</td>\n",
       "      <td>Single(1)</td>\n",
       "      <td>40.0</td>\n",
       "      <td>6637442812569910270</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13402</th>\n",
       "      <td>8.750147</td>\n",
       "      <td>Unknown</td>\n",
       "      <td>22</td>\n",
       "      <td>Single(1)</td>\n",
       "      <td>40.0</td>\n",
       "      <td>6637442812569910270</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13403</th>\n",
       "      <td>8.366543</td>\n",
       "      <td>Unknown</td>\n",
       "      <td>32</td>\n",
       "      <td>Single(1)</td>\n",
       "      <td>38.0</td>\n",
       "      <td>6637442812569910270</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13404</th>\n",
       "      <td>8.311427</td>\n",
       "      <td>Unknown</td>\n",
       "      <td>31</td>\n",
       "      <td>Single(1)</td>\n",
       "      <td>41.0</td>\n",
       "      <td>6637442812569910270</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       weight_pounds  is_male  mother_age  plurality  gestation_weeks  \\\n",
       "13400       6.624891  Unknown          17  Single(1)             39.0   \n",
       "13401       5.313141  Unknown          30  Single(1)             40.0   \n",
       "13402       8.750147  Unknown          22  Single(1)             40.0   \n",
       "13403       8.366543  Unknown          32  Single(1)             38.0   \n",
       "13404       8.311427  Unknown          31  Single(1)             41.0   \n",
       "\n",
       "                 hashmonth  \n",
       "13400  6637442812569910270  \n",
       "13401  6637442812569910270  \n",
       "13402  6637442812569910270  \n",
       "13403  6637442812569910270  \n",
       "13404  6637442812569910270  "
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "traindf.tail()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>weight_pounds</th>\n",
       "      <th>mother_age</th>\n",
       "      <th>gestation_weeks</th>\n",
       "      <th>hashmonth</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>count</th>\n",
       "      <td>26606.000000</td>\n",
       "      <td>26606.000000</td>\n",
       "      <td>26606.000000</td>\n",
       "      <td>2.660600e+04</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>mean</th>\n",
       "      <td>7.239026</td>\n",
       "      <td>27.343231</td>\n",
       "      <td>38.619409</td>\n",
       "      <td>4.404453e+18</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>std</th>\n",
       "      <td>1.328190</td>\n",
       "      <td>6.170775</td>\n",
       "      <td>2.558954</td>\n",
       "      <td>2.784172e+18</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>min</th>\n",
       "      <td>0.500449</td>\n",
       "      <td>12.000000</td>\n",
       "      <td>17.000000</td>\n",
       "      <td>1.244589e+17</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25%</th>\n",
       "      <td>6.563162</td>\n",
       "      <td>22.000000</td>\n",
       "      <td>38.000000</td>\n",
       "      <td>1.622638e+18</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>50%</th>\n",
       "      <td>7.312733</td>\n",
       "      <td>27.000000</td>\n",
       "      <td>39.000000</td>\n",
       "      <td>4.329667e+18</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>75%</th>\n",
       "      <td>8.062305</td>\n",
       "      <td>32.000000</td>\n",
       "      <td>40.000000</td>\n",
       "      <td>7.108882e+18</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>max</th>\n",
       "      <td>13.459221</td>\n",
       "      <td>50.000000</td>\n",
       "      <td>47.000000</td>\n",
       "      <td>9.183606e+18</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       weight_pounds    mother_age  gestation_weeks     hashmonth\n",
       "count   26606.000000  26606.000000     26606.000000  2.660600e+04\n",
       "mean        7.239026     27.343231        38.619409  4.404453e+18\n",
       "std         1.328190      6.170775         2.558954  2.784172e+18\n",
       "min         0.500449     12.000000        17.000000  1.244589e+17\n",
       "25%         6.563162     22.000000        38.000000  1.622638e+18\n",
       "50%         7.312733     27.000000        39.000000  4.329667e+18\n",
       "75%         8.062305     32.000000        40.000000  7.108882e+18\n",
       "max        13.459221     50.000000        47.000000  9.183606e+18"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Describe only does numeric columns, so you won't see plurality\n",
    "traindf.describe()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<h2> Write out </h2>\n",
    "<p>\n",
    "In the final versions, we want to read from files, not Pandas dataframes. So, write the Pandas dataframes out as CSV files. \n",
    "Using CSV files gives us the advantage of shuffling during read. This is important for distributed training because some workers might be slower than others, and shuffling the data helps prevent the same data from being assigned to the slow workers.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "traindf.to_csv('train.csv', index=False, header=False)\n",
    "evaldf.to_csv('eval.csv', index=False, header=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "   6444 eval.csv\n",
      "  26606 train.csv\n",
      "  33050 total\n",
      "==> eval.csv <==\n",
      "7.25100379718,False,23,Single(1),39.0,7146494315947640619\n",
      "7.936641432,True,19,Single(1),39.0,6244544205302024223\n",
      "6.6248909731,True,27,Single(1),37.0,1891060869255459203\n",
      "7.31273323054,True,33,Single(1),38.0,2246942437170405963\n",
      "6.13326012884,True,42,Single(1),36.0,6365946696709051755\n",
      "7.43839671988,False,23,Single(1),40.0,4740473290291881219\n",
      "7.936641432,False,23,Single(1),40.0,7146494315947640619\n",
      "6.52788757782,True,39,Single(1),36.0,4740473290291881219\n",
      "6.75055446244,False,25,Single(1),34.0,8904940584331855459\n",
      "6.20821729792,False,33,Single(1),38.0,6365946696709051755\n",
      "\n",
      "==> train.csv <==\n",
      "7.87491199864,True,27,Single(1),40.0,774501970389208065\n",
      "9.31232594688,True,33,Single(1),38.0,774501970389208065\n",
      "9.37626000286,True,31,Single(1),40.0,774501970389208065\n",
      "7.3744626639,True,31,Single(1),38.0,774501970389208065\n",
      "8.5098433132,False,34,Single(1),38.0,774501970389208065\n",
      "7.3744626639,False,28,Single(1),39.0,774501970389208065\n",
      "7.1870697412,False,33,Single(1),38.0,774501970389208065\n",
      "8.75014717878,False,22,Single(1),41.0,774501970389208065\n",
      "7.35903030556,True,18,Single(1),42.0,774501970389208065\n",
      "6.686620406459999,False,30,Single(1),39.0,774501970389208065\n",
      "==> eval.csv <==\n",
      "7.31273323054,Unknown,24,Single(1),40.0,1639186255933990135\n",
      "6.3118345610599995,Unknown,40,Single(1),38.0,74931465496927487\n",
      "7.1870697412,Unknown,33,Single(1),37.0,74931465496927487\n",
      "8.24969784404,Unknown,33,Single(1),39.0,3182182455926341111\n",
      "8.0689187892,Unknown,24,Single(1),41.0,74931465496927487\n",
      "8.421658408399999,Unknown,32,Single(1),41.0,6910174677251748583\n",
      "6.80787465056,Unknown,25,Single(1),39.0,6141045177192779423\n",
      "6.8122838958,Unknown,39,Single(1),40.0,6141045177192779423\n",
      "7.16281889238,Unknown,22,Single(1),37.0,1639186255933990135\n",
      "7.5618555866,Unknown,30,Single(1),42.0,8904940584331855459\n",
      "\n",
      "==> train.csv <==\n",
      "8.000575487979999,Unknown,21,Single(1),39.0,6637442812569910270\n",
      "8.56275425608,Unknown,27,Single(1),40.0,6637442812569910270\n",
      "5.93704871566,Unknown,28,Single(1),36.0,6637442812569910270\n",
      "6.7902376696,Unknown,29,Single(1),40.0,6637442812569910270\n",
      "8.0358494499,Unknown,27,Single(1),41.0,6637442812569910270\n",
      "6.6248909731,Unknown,17,Single(1),39.0,6637442812569910270\n",
      "5.3131405142,Unknown,30,Single(1),40.0,6637442812569910270\n",
      "8.75014717878,Unknown,22,Single(1),40.0,6637442812569910270\n",
      "8.3665428429,Unknown,32,Single(1),38.0,6637442812569910270\n",
      "8.3114272774,Unknown,31,Single(1),41.0,6637442812569910270\n"
     ]
    }
   ],
   "source": [
    "%%bash\n",
    "wc -l *.csv\n",
    "head *.csv\n",
    "tail *.csv"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Copyright 2020 Google Inc. Licensed under the Apache License, Version 2.0 (the \"License\"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an \"AS IS\" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
